Let us set some global options for all code chunks in this document.

knitr::opts_chunk$set(
  message = FALSE,    # Disable messages printed by R code chunks
  warning = FALSE,    # Disable warnings printed by R code chunks
  echo = TRUE,        # Show R code within code chunks in output
  include = TRUE,     # Include both R code and its results in output
  eval = TRUE,       # Evaluate R code chunks
  cache = FALSE,       # Enable caching of R code chunks for faster rendering
  fig.align = "center",
  out.width = "100%",
  retina = 2,
  error = TRUE,
  collapse = TRUE
)
rm(list = ls())
set.seed(1982)

1 PREPROCESSING

Let us now load some required libraries.

# Load required libraries

# inla.upgrade(testing = TRUE)
# remotes::install_github("inlabru-org/inlabru", ref = "devel")
# remotes::install_github("davidbolin/rspde", ref = "devel")
# remotes::install_github("davidbolin/metricgraph", ref = "devel")
# remotes::install_github("davidbolin/ngme2", ref = "devel")

library(INLA)
#inla.setOption(num.threads = 7)
library(inlabru)
library(rSPDE)
library(MetricGraph)
library(ngme2)

library(plotly)
library(dplyr)

library(sf)

library(here)

Function standarize() below is later used to standardize the covariate SpeedLimit.

standardize <- function(x) {return((x - mean(x)) / sd(x))}

1.1 PREPARING AND ADDING THE DATA TO THE GRAPH.

We load the graph object sf_graph (which only contains weights) and the data (already graph-processed).

timeallprocedure <- Sys.time()
load(here("Graph_objects/graph_construction_19MAY24_FRC0134.RData"))
load(here("Data_files/data_day7142128_hour13_with_no_consecutive_zeros_19MAY24_FRC0134_graph_processed.RData"))
data_on_graph = data_on_graph %>% 
  dplyr::select(-datetime)

The following commands remove zero speed observations that are 1m away from the graph, and after that, they remove any speed observations that are 3m away from the graph.

to_remove = data_on_graph %>%
  filter(speed == 0, .distance_to_graph > 0.001) 

data_on_graph = setdiff(data_on_graph, to_remove) %>% 
  filter(.distance_to_graph <= 0.003)

We add data to the graph.

sf_graph$add_observations(data = data_on_graph, 
                          group = "day", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

We get the values of the weights at data locations. This essentially gives us covariates from the weights.

sf_graph$edgeweight_to_data(data_loc = TRUE)

When running sf_graph$edgeweight_to_data(data_loc = TRUE), some NA values are created (because the data is grouped). We remove them below. We also standardize the SpeedLimit covariate.

data <- sf_graph$get_data() %>% 
  drop_na(-StreetName) %>% # this drops all rows with at least one NA value but without taking into account StreetName
  mutate(across(c("SpeedLimit"), ~standardize(.))) %>%
  dplyr::select(speed, SpeedLimit)

1.2 NOW WE ADD THE FINAL VERSION OF THE DATA TO THE GRAPH

We add the data again but now with the new standardized SpeedLimit covariate.

sf_graph$add_observations(data = data, 
                          group = "day", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

1.3 GET THE OBSERVATION LOCATIONS (FROM THE (FINAL VERSION OF THE) DATA EXTRACTED FROM THE GRAPH)

data_final <- sf_graph$get_data()
obs.loc <- data_final |> as.data.frame() |> dplyr::select(.edge_number, .distance_on_edge)

1.4 BUILD THE MESH

h = 0.05
sf_graph$build_mesh(h = h)

1.5 GET THE VALUES OF THE COVARIATE ON THE MESH LOCATIONS

We get the value of the weights at mesh locations. This will allow us to built matrices B.sigma and B.range below. Again, sf_graph$edgeweight_to_data(mesh = TRUE, add = FALSE, return = TRUE) creates repeated information (because the data is grouped). We fix that by filtering one group. We also standardize the SpeedLimit covariate.

mesh <- sf_graph$edgeweight_to_data(mesh = TRUE, 
                                   add = FALSE, 
                                   return = TRUE) %>% 
  filter(.group == 1) %>%
  mutate(across(c("SpeedLimit"), ~standardize(.))) %>%
  dplyr:::select.data.frame(SpeedLimit)

1.6 GET THE PROJECTION MATRIX (FROM THE MESH TO THE OBSERVATION LOCATIONS)

#mesh.loc <- sf_graph$mesh$VtE
A.proj <- sf_graph$fem_basis(obs.loc)

2 STATIONARY MODEL

2.1 SIMULATE THE STATIONARY FIELD

  • Observe that we are considering replicates.
stat.time.ini <- Sys.time()

sigma <- 1.3
range <- 0.15
nu <- 0.5
sigma.e <- 0.1
rspde.order <- 1

op <- matern.operators(nu = nu, 
                       range = range, 
                       sigma = sigma, 
                       parameterization = "matern",
                       m = rspde.order, 
                       graph = sf_graph)

u_stat = simulate(op)

2.2 SIMULATE THE STATIONARY RESPONSE

speed_sim <- 1 + 2*data_final$SpeedLimit + as.vector(A.proj %*% u_stat + sigma.e * rnorm(nrow(data_final)))

2.3 CREATE A NEW DATASET (THE ONLY THING THAT CHANGES IS THE SIMULATED STATIONARY RESPONSE)

data_sim_stat <- data_final %>% 
  mutate(speed = speed_sim)

2.4 ADD THE DATA WITH SIMULATED RESPONSE (USING STATIONARY FIELD)

sf_graph$add_observations(data = data_sim_stat, 
                          group = ".group", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

2.5 FIT THE STATIONARY MODEL

rspde_model_stat <- rspde.metric_graph(sf_graph,
                                         parameterization = "matern",
                                         nu = nu)

data_rspde_bru_stat <- graph_data_rspde(rspde_model_stat,
                                        repl = ".all",
                                        loc_name = "loc")
cmp_stat = speed ~ -1 +
  Intercept(1) +
  SpeedLimit +
  field(loc, model = rspde_model_stat,
        replicate = data_rspde_bru_stat[["repl"]])

rspde_fit_stat <-
  bru(cmp_stat,
      data = data_rspde_bru_stat[["data"]],
      family = "gaussian",
      options = list(verbose = FALSE)
  )

2.6 SHOW THE RESULTS FOR THE STATIONARY MODEL

stat.time.fin <- Sys.time()
print(stat.time.fin - stat.time.ini)
## Time difference of 4.166325 mins
summary(rspde_fit_stat)
## inlabru version: 2.10.1.9007
## INLA version: 24.05.18-2
## Components:
## Intercept: main = linear(1), group = exchangeable(1L), replicate = iid(1L)
## SpeedLimit: main = linear(SpeedLimit), group = exchangeable(1L), replicate = iid(1L)
## field: main = cgeneric(loc), group = exchangeable(1L), replicate = iid(data_rspde_bru_stat[["repl"]])
## Likelihoods:
##   Family: 'gaussian'
##     Data class: 'metric_graph_data', 'list'
##     Predictor: speed ~ .
## Time used:
##     Pre = 1.37, Running = 106, Post = 4.62, Total = 112 
## Fixed effects:
##             mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## Intercept  0.963 0.015      0.933    0.963      0.993 0.963   0
## SpeedLimit 2.011 0.005      2.002    2.011      2.020 2.011   0
## 
## Random effects:
##   Name     Model
##     field CGeneric
## 
## Model hyperparameters:
##                                            mean    sd 0.025quant 0.5quant
## Precision for the Gaussian observations  99.950 1.337     97.357   99.937
## Theta1 for field                          0.244 0.008      0.229    0.244
## Theta2 for field                         -1.906 0.024     -1.953   -1.906
##                                         0.975quant   mode
## Precision for the Gaussian observations     102.62 99.900
## Theta1 for field                              0.26  0.245
## Theta2 for field                             -1.86 -1.905
## 
## Deviance Information Criterion (DIC) ...............: -31770.34
## Deviance Information Criterion (DIC, saturated) ....: 39723.11
## Effective number of parameters .....................: 13880.06
## 
## Watanabe-Akaike information criterion (WAIC) ...: -34017.26
## Effective number of parameters .................: 8821.08
## 
## Marginal log-Likelihood:  -7631.06 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
fit.rspde = rspde.result(rspde_fit_stat, "field", rspde_model_stat)
summary(fit.rspde)
##            mean         sd 0.025quant 0.5quant 0.975quant     mode
## std.dev 1.27698 0.01022070   1.256980 1.276950   1.297100 1.276920
## range   0.14875 0.00355414   0.141819 0.148722   0.155787 0.148682

3 NONSTATIONARY MODEL

3.1 SIMULATE THE NONSTATIONARY FIELD

nonstat.time.ini <- Sys.time()

B.sigma = cbind(0, 1, 0, mesh$SpeedLimit, 0)
B.range = cbind(0, 0, 1, 0, mesh$SpeedLimit)

cov_nonstat <- rSPDE::spde.matern.operators(graph = sf_graph,
                                                parameterization = "matern",
                                                B.sigma = B.sigma,
                                                B.range = B.range,
                                                theta = c(sigma, range, sigma, range),
                                                nu = nu)$covariance_mesh()
L <- chol(cov_nonstat)
u_nonstat <- t(L)%*%rnorm(dim(cov_nonstat)[1])

3.2 SIMULATE THE NONSTATIONARY RESPONSE

speed_sim_nonstat <- 1 + 2*data_final$SpeedLimit + as.vector(A.proj %*% u_nonstat + sigma.e * rnorm(nrow(data_final)))

3.3 CREATE A NEW DATASET (THE ONLY THING THAT CHANGES IS THE SIMULATED NONSTATIONARY RESPONSE)

data_sim_nonstat <- data_final %>% 
  mutate(speed = speed_sim_nonstat)

3.4 ADD THE DATA WITH SIMULATED RESPONSE (USING NONSTATIONARY FIELD)

sf_graph$add_observations(data = data_sim_nonstat, 
                          group = ".group", 
                          normalized = TRUE, 
                          clear_obs = TRUE)

3.5 FIT THE NONSTATIONARY MODEL

init.vec.theta = c(fit.rspde$summary.log.std.dev$mode, 
                   fit.rspde$summary.log.range$mode, 
                   rep(0, (ncol(B.sigma)-3)))

rspde_model_nonstat <- rspde.metric_graph(sf_graph,
                                          start.theta = init.vec.theta,
                                          theta.prior.mean = init.vec.theta,
                                          B.sigma = B.sigma,
                                          B.range = B.range,
                                          parameterization = "matern",
                                          nu = nu)

data_rspde_bru_nonstat <- graph_data_rspde(rspde_model_nonstat,
                                           repl = ".all",
                                           loc_name = "loc")
cmp_nonstat = speed ~ -1 +
  Intercept(1) +
  SpeedLimit +
  field(loc, model = rspde_model_nonstat,
        replicate = data_rspde_bru_nonstat[["repl"]])

rspde_fit_nonstat <-
  bru(cmp_nonstat,
      data = data_rspde_bru_nonstat[["data"]],
      family = "gaussian",
      options = list(verbose = FALSE)
  )

3.6 SHOW THE RESULTS FOR THE NONSTATIONARY MODEL

nonstat.time.fin <- Sys.time()
print(nonstat.time.fin - nonstat.time.ini)
## Time difference of 18.80146 mins
summary(rspde_fit_nonstat)
## inlabru version: 2.10.1.9007
## INLA version: 24.05.18-2
## Components:
## Intercept: main = linear(1), group = exchangeable(1L), replicate = iid(1L)
## SpeedLimit: main = linear(SpeedLimit), group = exchangeable(1L), replicate = iid(1L)
## field: main = cgeneric(loc), group = exchangeable(1L), replicate = iid(data_rspde_bru_nonstat[["repl"]])
## Likelihoods:
##   Family: 'gaussian'
##     Data class: 'metric_graph_data', 'list'
##     Predictor: speed ~ .
## Time used:
##     Pre = 1.03, Running = 133, Post = 3.12, Total = 137 
## Fixed effects:
##             mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## Intercept  0.999 0.007      0.985    0.999      1.014 0.999   0
## SpeedLimit 1.996 0.004      1.988    1.996      2.004 1.996   0
## 
## Random effects:
##   Name     Model
##     field CGeneric
## 
## Model hyperparameters:
##                                            mean    sd 0.025quant 0.5quant
## Precision for the Gaussian observations 101.198 1.390     98.697  101.125
## Theta1 for field                          1.359 0.035      1.283    1.361
## Theta2 for field                          0.241 0.075      0.079    0.245
## Theta3 for field                          1.414 0.041      1.345    1.410
## Theta4 for field                          0.389 0.083      0.252    0.382
##                                         0.975quant    mode
## Precision for the Gaussian observations    104.151 100.806
## Theta1 for field                             1.421   1.370
## Theta2 for field                             0.374   0.266
## Theta3 for field                             1.506   1.393
## Theta4 for field                             0.572   0.346
## 
## Deviance Information Criterion (DIC) ...............: -32326.09
## Deviance Information Criterion (DIC, saturated) ....: 39304.80
## Effective number of parameters .....................: 13465.09
## 
## Watanabe-Akaike information criterion (WAIC) ...: -34406.89
## Effective number of parameters .................: 8659.64
## 
## Marginal log-Likelihood:  -13416.37 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
summary(rspde.result(rspde_fit_nonstat, "field", rspde_model_nonstat))
##                   mean        sd 0.025quant 0.5quant 0.975quant     mode
## Theta1.matern 1.358720 0.0352581  1.2833000 1.360660   1.421300 1.370500
## Theta2.matern 0.240648 0.0753328  0.0794756 0.244791   0.374320 0.265849
## Theta3.matern 1.413980 0.0414072  1.3452700 1.410410   1.505700 1.392620
## Theta4.matern 0.388852 0.0827963  0.2516050 0.381654   0.572326 0.345873

4 CROSS-VALIDATION

#load(here("Models_output/distmatrixfixed.RData"))

points = data_final %>%
  as.data.frame() %>%
  mutate(., index = 1:nrow(.)) %>% 
  dplyr:::select(speed, .group, index) %>%
  mutate(.group = as.numeric(.group)) %>%
  group_by(.group) %>%
  mutate(indexingroup = seq_len(n())) %>%
  ungroup()

distance = seq(from = 0, to = 200, by = 20)/1000

The code of chunk below was executed only one time.


{r}
load(here("Models_output/distmatrixfixed_19May24.RData"))

points = data_final %>%
  as.data.frame() %>%
  mutate(., index = 1:nrow(.)) %>% 
  dplyr:::select(speed, .group, index) %>%
  mutate(.group = as.numeric(.group)) %>%
  group_by(.group) %>%
  mutate(indexingroup = seq_len(n())) %>%
  ungroup()

distance = seq(from = 0, to = 200, by = 20)/1000

GROUPS <- list()
for (j in 1:length(distance)) {
  print(j)
  GROUPS[[j]] = list()
  for (i in 1:nrow(points)) {
    rowi = points[i, ]
    which.in.group <- which(as.vector(distmatrixlist[[rowi$.group]][rowi$indexingroup,]) <= distance[j])
    GROUPS[[j]][[i]] <- filter(points, .group == rowi$.group)[which.in.group, ]$index
  }
}
save(GROUPS, file = here("Models_output/GROUPS_19May24_corrected.RData"))
load(here("Models_output/GROUPS_19May24_corrected.RData"))

The code of chunk above was executed only one time.


ggplot(data_final, aes(x = .coord_x, y = .coord_y, color = .group)) +
  geom_point(size = 0.1) +
  facet_wrap(~ .group) +
  theme_minimal() +
  labs(title = "Scatter Plot of Points by Groups",
       x = "X-axis Label",
       y = "Y-axis Label")

indexofinterest <- 20000
pointofinterest <- GROUPS[[11]][[indexofinterest]]
windowofinterest <- data_final[pointofinterest, ] |> as.data.frame() |> st_as_sf(coords = c(".coord_x", ".coord_y"), crs = 4326)

bbox <- st_bbox(windowofinterest)

polygon <- st_polygon(list(rbind(
  c(bbox["xmin"], bbox["ymin"]),
  c(bbox["xmax"], bbox["ymin"]),
  c(bbox["xmax"], bbox["ymax"]),
  c(bbox["xmin"], bbox["ymax"]),
  c(bbox["xmin"], bbox["ymin"])
))) |>
st_sfc(crs = st_crs(windowofinterest))

data_final_filtered <- data_final %>% 
  as.data.frame() %>%
  st_as_sf(coords = c(".coord_x", ".coord_y"), crs = 4326) %>%
  st_filter(x = ., y = polygon, .predicate = st_within)

ggplot() +
  geom_sf(data = data_final_filtered, aes(color = .group))


ggplot() +
  geom_sf(data = data_final_filtered, aes(color = .group)) +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")


ggplot() +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")


ggplot() +
  geom_point(data = filter(data_final, .group == data_final[indexofinterest, ]$.group), aes(x = .coord_x, y = .coord_y), color = "black") +
  geom_point(data = data_final[pointofinterest, ], aes(x = .coord_x, y = .coord_y), color = "blue") +
  geom_point(data = data_final[indexofinterest, ], aes(x = .coord_x, y = .coord_y), color = "red")

mse.stat <- mse.nonstat <- ls.stat <- ls.nonstat <- rep(0,length(distance))
# cross-validation for-loop
for (j in 1:length(distance)) {
  print(j)
  # cross-validation of the stationary model
  cv.stat <- inla.group.cv(rspde_fit_stat, groups = GROUPS[[j]])
  # cross-validation of the nonstationary model
  cv.nonstat <- inla.group.cv(rspde_fit_nonstat, groups = GROUPS[[j]])
  # obtain MSE and LS
  mse.stat[j] <- mean((cv.stat$mean - data_sim_stat$speed)^2)
  mse.nonstat[j] <- mean((cv.nonstat$mean - data_sim_nonstat$speed)^2)
  ls.stat[j] <- mean(log(cv.stat$cv))
  ls.nonstat[j] <- mean(log(cv.nonstat$cv))
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11

## plot results
par(mfrow = c(2,2), family = "Palatino")

# Plot MSE
plot(distance, mse.stat, main = "MSE", ylim = c(min(mse.nonstat, mse.stat), max(mse.nonstat, mse.stat)),
     type = "l", ylab = "MSE", xlab = "distance in m", col = "black")
lines(distance, mse.nonstat, col = "blue")
legend("bottomright", legend = c("Stationary", "Non-stationary"), col = c("black", "blue"), lty = 1)

# Plot log-score
plot(distance, -ls.stat, main = "log-score", ylim = c(min(-ls.nonstat, -ls.stat), max(-ls.nonstat, -ls.stat)),
     type = "l", ylab = "log-score", xlab = "distance in m", col = "black")
lines(distance, -ls.nonstat, col = "blue")
legend("bottomright", legend = c("Stationary", "Non-stationary"), col = c("black", "blue"), lty = 1)

finaltimeallprocedure <- Sys.time()
print(finaltimeallprocedure - timeallprocedure)
## Time difference of 28.4451 mins
save.image(here(paste0("Models_output/", rmarkdown::metadata$title, ".RData")))